dNLS Preprocessing QC statistics ¶

October 2024¶

In [1]:
import os
NOVA_HOME = '/home/labs/hornsteinlab/Collaboration/MOmaps_Noam/MOmaps'
NOVA_DATA_HOME = '/home/labs/hornsteinlab/Collaboration/MOmaps'
LOGS_PATH = os.path.join(NOVA_DATA_HOME, "outputs/preprocessing/spd/logs/dNLS")
PLOT_PATH = os.path.join(NOVA_HOME, 'src', 'preprocessing', 'notebooks','figures','dNLS')

NOVA_HOME = '/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA'
NOVA_DATA_HOME = '/home/labs/hornsteinlab/Collaboration/MOmaps'
LOGS_PATH = os.path.join(NOVA_HOME, 'logs', 'dnls')
PLOT_PATH = os.path.join(NOVA_HOME, 'src', 'preprocessing', 'notebooks','figures','dnls_80pct')



os.chdir(NOVA_HOME)
import pandas as pd
import numpy as np
# plt.rcParams["image.cmap"] = "Set1"
import contextlib
import io
from IPython.display import display, Javascript

from tools.preprocessing_tools.qc_reports.qc_utils import log_files_qc, run_validate_folder_structure, display_diff, sample_and_calc_variance, \
                                                show_site_survival_dapi_brenner, show_site_survival_dapi_cellpose, \
                                                show_site_survival_dapi_tiling, show_site_survival_target_brenner, \
                                                calc_total_sums, plot_filtering_heatmap, show_total_sum_tables, \
                                                plot_cell_count, plot_catplot, plot_hm_combine_batches, plot_hm, \
                                                run_calc_hist_new
                                                
from tools.preprocessing_tools.qc_reports.qc_config import dnls_panels, dnls_markers, dnls_marker_info, dnls_cell_lines, \
                                                dnls_cell_lines_to_cond, dnls_cell_lines_for_disp, reps, \
                                                dnls_line_colors, dnls_lines_order, dnls_custom_palette, \
                                                dnls_expected_dapi_raw, markers, custom_palette

%load_ext autoreload
%autoreload 2
In [14]:
# choose batches
batches = [f'batch{i}' for i in range(3,6)]
batches
Out[14]:
['batch3', 'batch4', 'batch5']
In [15]:
df = log_files_qc(LOGS_PATH, only_wt_cond=False, batches=batches)
# we are not using the wt line eventually
df = df[df.cell_line != 'WT']

df_dapi = df[df.marker=='DAPI']
df_target = df[df.marker!='DAPI']
# we need to match between the raw marker name (TDP43) and the processed marker name (TDP43N / TDP43B)
df_target.loc[df_target['marker'] == 'TDP43', 'marker'] += df_target['panel'].str.replace('panel', '')
reading logs of batch4
reading logs of batch3
reading logs of batch5

Total of 3 files were read.
Before dup handeling  (76919, 21)
After duplication removal #1: (76919, 22)
After duplication removal #2: (76919, 22)

Actual Files Validation¶

Raw Files Validation¶

  1. How many site tiff files do we have in each folder?
  2. Are all existing files valid? (tif, at least 2049kB, not corrupetd)
In [16]:
root_directory_raw = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'raw', 'SpinningDisk','deltaNLS_sort')

raws = run_validate_folder_structure(root_directory_raw, False, dnls_panels, dnls_markers.copy(),PLOT_PATH, dnls_marker_info,
                                    dnls_cell_lines_to_cond, reps, dnls_cell_lines_for_disp, dnls_expected_dapi_raw,
                                     batches=batches, fig_width=3)
batch3
Folder structure is invalid. Missing 1 paths:
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/raw/SpinningDisk/deltaNLS_sort/batch3/TDP43/panelN
No bad files are found.
Total Sites:  17200
No description has been provided for this image
========
batch4
Folder structure is invalid. Missing 1 paths:
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/raw/SpinningDisk/deltaNLS_sort/batch4/TDP43/panelN
No bad files are found.
Total Sites:  17200
No description has been provided for this image
========
batch5
Folder structure is valid.
No bad files are found.
Total Sites:  18000
No description has been provided for this image
========
====================

Processed Files Validation¶

  1. How many site npy files do we have in each folder? -> How many sites survived the pre-processing?
  2. Are all existing files valid? (at least 100kB, npy not corrupted)
In [17]:
root_directory_proc = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'processed', 'spd2',
                              'SpinningDisk','deltaNLS_80pct')
procs = run_validate_folder_structure(root_directory_proc, True, dnls_panels, dnls_markers,PLOT_PATH,dnls_marker_info,
                                    dnls_cell_lines_to_cond, reps, dnls_cell_lines_for_disp, dnls_expected_dapi_raw,
                                     batches=batches, fig_width=3)
batch3
Folder structure is invalid. Missing 4 paths:
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch3/TDP43/dox/TDP43N
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch3/TDP43/dox/TDP43B
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch3/TDP43/Untreated/TDP43N
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch3/TDP43/Untreated/TDP43B
No bad files are found.
Total Sites:  16748
No description has been provided for this image
========
batch4
Folder structure is invalid. Missing 4 paths:
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch4/TDP43/dox/TDP43N
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch4/TDP43/dox/TDP43B
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch4/TDP43/Untreated/TDP43N
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch4/TDP43/Untreated/TDP43B
No bad files are found.
Total Sites:  16634
No description has been provided for this image
========
batch5
Folder structure is invalid. Missing 4 paths:
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch5/TDP43/dox/TDP43N
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch5/TDP43/dox/TDP43B
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch5/TDP43/Untreated/TDP43N
/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch5/TDP43/Untreated/TDP43B
No bad files are found.
Total Sites:  17125
No description has been provided for this image
========
====================

Difference between Raw and Processed¶

In [18]:
display_diff(batches, raws, procs, PLOT_PATH, fig_width=3)
batch3
No description has been provided for this image
========
batch4
No description has been provided for this image
========
batch5
No description has been provided for this image
========

Variance in each batch (of processed files)¶

In [10]:
for batch in batches:
    with contextlib.redirect_stdout(io.StringIO()):
        var = sample_and_calc_variance(root_directory_proc, batch, 
                                       sample_size_per_markers=200, cond_count=2, rep_count=len(reps), 
                                       num_markers=len(dnls_markers))
    print(f'{batch} var: ',var)
batch3 var:  0.010259739259552004
batch4 var:  0.010513111541071974
batch5 var:  0.010152732140884965

Preprocessing Filtering qc¶

By order of filtering

1. % site survival after Brenner on DAPI channel¶

Percentage out of the total sites

In [19]:
dapi_filter_by_brenner = show_site_survival_dapi_brenner(df_dapi,batches, dnls_line_colors, dnls_panels, reps, figsize=(3,5))
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

2. % Site survival after Cellpose¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.

A site will be filtered out if Cellpose found 0 cells in it.

In [20]:
dapi_filter_by_cellpose = show_site_survival_dapi_cellpose(df_dapi, batches, dapi_filter_by_brenner, dnls_line_colors, 
                                                           dnls_panels, reps, figsize=(3,5))
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

3. % Site survival by tiling¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.

A site will be filtered out if after tiling, no tile is containing at least one whole cell that Cellpose detected.

In [21]:
dapi_filter_by_tiling=show_site_survival_dapi_tiling(df_dapi, batches, dapi_filter_by_cellpose, dnls_line_colors, dnls_panels, 
                                                     reps, figsize=(3,5))
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

4. % Site survival after Brenner on target channel¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values (if different than the percentages).

In [22]:
show_site_survival_target_brenner(df_dapi, df_target, dapi_filter_by_tiling, dnls_markers,figsize=(3,8))
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Statistics About the Processed Files¶

In [23]:
names = ['Total number of tiles', 'Total number of whole cells']
stats = ['n_valid_tiles','site_whole_cells_counts_sum','site_cell_count','site_cell_count_sum']
total_sum = calc_total_sums(df_target, df_dapi, stats, dnls_markers)

Total tiles¶

In [24]:
markers_for_dnls = markers.copy()
markers_for_dnls.remove('TIA1')
markers_for_dnls += ['TDP43B']
total_sum[total_sum.marker.isin(markers_for_dnls)].n_valid_tiles.sum()
Out[24]:
468671

Total whole nuclei in tiles¶

In [25]:
total_sum[total_sum.marker =='DAPI'].site_whole_cells_counts_sum.sum()
Out[25]:
125177.0

Total nuclei in sites¶

In [26]:
total_sum[total_sum.marker =='DAPI'].site_cell_count.sum()
Out[26]:
328744.0
In [27]:
show_total_sum_tables(total_sum)
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch3
count 172.000000 172.000000 172.000000 172.00000
mean 1095.517442 10.955174 826.500000 2144.02907
std 125.228982 1.252290 86.795754 247.21836
min 816.000000 8.160000 624.000000 1548.00000
25% 1014.000000 10.140000 782.500000 1961.00000
50% 1089.500000 10.895000 822.000000 2160.50000
75% 1196.000000 11.960000 896.000000 2320.75000
max 1328.000000 13.280000 1004.000000 2644.00000
sum 188429.000000 NaN 142158.000000 368773.00000
expected_count 450.000000 450.000000 450.000000 450.00000
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch4
count 172.000000 172.000000 172.000000 172.000000
mean 1021.773256 10.217733 750.988372 2005.116279
std 168.849844 1.688498 123.873874 326.967242
min 580.000000 5.800000 454.000000 1110.000000
25% 909.000000 9.090000 655.000000 1788.000000
50% 1037.000000 10.370000 758.500000 2039.000000
75% 1138.000000 11.380000 841.000000 2202.750000
max 1328.000000 13.280000 995.000000 2614.000000
sum 175745.000000 NaN 129170.000000 344880.000000
expected_count 450.000000 450.000000 450.000000 450.000000
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch5
count 180.000000 180.000000 180.000000 180.000000
mean 1027.938889 10.279389 769.911111 2013.633333
std 150.921977 1.509220 110.270986 299.418653
min 644.000000 6.440000 497.000000 1291.000000
25% 924.250000 9.242500 698.500000 1771.000000
50% 1061.000000 10.610000 786.000000 2064.000000
75% 1129.000000 11.290000 845.000000 2196.000000
max 1301.000000 13.010000 1012.000000 2544.000000
sum 185029.000000 NaN 138584.000000 362454.000000
expected_count 450.000000 450.000000 450.000000 450.000000
n valid tiles % valid tiles site_whole_cells_counts_sum site_cell_count
All batches
count 524.000000 524.000000 524.000000 5.240000e+02
mean 1048.097328 10.480973 782.274809 2.053639e+03
std 152.814238 1.528142 112.515547 2.993880e+02
min 580.000000 5.800000 454.000000 1.110000e+03
25% 937.000000 9.370000 705.000000 1.846000e+03
50% 1067.000000 10.670000 794.000000 2.102000e+03
75% 1172.000000 11.720000 861.000000 2.267000e+03
max 1328.000000 13.280000 1012.000000 2.644000e+03
sum 549203.000000 NaN 409912.000000 1.076107e+06
expected_count 450.000000 450.000000 450.000000 4.500000e+02

Show Total Tile Counts¶

For each batch, cell line, replicate and markerTotal number of tiles

In [28]:
to_heatmap = total_sum.rename(columns={'n_valid_tiles':'index'})
plot_filtering_heatmap(to_heatmap, extra_index='marker', vmin=None, vmax=None,
                      xlabel = 'Total number of tiles', show_sum=True, figsize=(3,8))
No description has been provided for this image
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
No description has been provided for this image
No description has been provided for this image
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
No description has been provided for this image
No description has been provided for this image
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
No description has been provided for this image

Show Total Whole Cell Counts¶

For each batch, cell line, replicate and markerTotal number of tiles

In [29]:
to_heatmap = total_sum.rename(columns={'site_whole_cells_counts_sum':'index'})
plot_filtering_heatmap(to_heatmap, extra_index='marker', vmin=None, vmax=None,
                      xlabel = 'Total number of whole cells', show_sum=True, figsize=(3,8))
No description has been provided for this image
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
No description has been provided for this image
No description has been provided for this image
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
No description has been provided for this image
No description has been provided for this image
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
No description has been provided for this image

Show Cell Count Statistics per Batch¶

In [30]:
df_no_empty_sites = df_dapi[df_dapi.n_valid_tiles !=0]
plot_cell_count(df_no_empty_sites, dnls_lines_order, dnls_custom_palette, y='site_cell_count_sum', 
                title='Cell Count Average per Site (from tiles)')

plot_cell_count(df_no_empty_sites, dnls_lines_order, dnls_custom_palette, y='site_whole_cells_counts_sum',
                title='Whole Cell Count Average per Site')

plot_cell_count(df_no_empty_sites, dnls_lines_order, dnls_custom_palette, y='site_cell_count',
               title='Cellpose Cell Count Average per Site')
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Show Tiles per Site Statistics¶

In [31]:
df_dapi.groupby(['cell_line_cond']).n_valid_tiles.mean()
Out[31]:
cell_line_cond
TDP43 Untreated     9.59600
TDP43 dox          11.36725
Name: n_valid_tiles, dtype: float64
In [32]:
df_dapi[['site_cell_count']].mean()
Out[32]:
site_cell_count    20.5465
dtype: float64
In [33]:
plot_catplot(df_dapi, custom_palette,reps, x='n_valid_tiles', x_title='valid tiles count', batch_min=3, batch_max=5, height=6)
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1017: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'batch_rep'] = df['batch'] + " " + df['rep']
No description has been provided for this image

Show Mean of cell count in valid tiles¶

In [34]:
plot_hm(df_dapi, split_by='rep', rows='cell_line_cond', columns='panel', figsize=(10,3))
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Assessing Staining Reproducibility and Outliers¶

In [35]:
dnls_cell_lines_for_disp
Out[35]:
{'TDP43_dox': 'TDP43_dox', 'TDP43_Untreated': 'TDP43_Untreated'}
In [36]:
for batch in batches:
    print(batch)
    run_calc_hist_new(f'{batch}', dnls_cell_lines_for_disp, dnls_markers,
                      root_directory_raw, root_directory_proc,
                           hist_sample=10,sample_size_per_markers=200, ncols=8, nrows=4, dnls=True)
    print("="*30)
batch3
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
==============================
batch4
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
==============================
batch5
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
==============================
In [37]:
# save notebook as HTML ( the HTML will be saved in the same folder the original script is)
from IPython.display import display, Javascript
display(Javascript('IPython.notebook.save_checkpoint();'))
os.system(f'jupyter nbconvert --to html tools/preprocessing_tools/qc_reports/qc_report_dNLS_80pct.ipynb --output {NOVA_HOME}/manuscript/preprocessing_qc_reports/qc_report_dNLS_80pct.html')
usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [--paths] [--json] [--debug]
               [subcommand]

Jupyter: Interactive Computing

positional arguments:
  subcommand     the subcommand to launch

optional arguments:
  -h, --help     show this help message and exit
  --version      show the versions of core jupyter packages and exit
  --config-dir   show Jupyter config dir
  --data-dir     show Jupyter data dir
  --runtime-dir  show Jupyter runtime dir
  --paths        show all Jupyter paths. Add --json for machine-readable
                 format.
  --json         output paths as machine-readable json
  --debug        output debug information about paths

Available subcommands: kernel kernelspec migrate run troubleshoot

Jupyter command `jupyter-nbconvert` not found.
Out[37]:
256
In [ ]: